User Profiling and Segmentation using Python¶

Approach¶

  1. Determine the Aim
  2. Data Collection
  3. Feature Engineering
  4. Segmentation
  5. Profiling by Segment
In [1]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display all columns and rows without truncation.
# NOTE: max_rows=None can produce very large outputs for big frames — use with care.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
In [2]:
# Read data from CSV file.
# NOTE(review): relative path — assumes the CSV sits next to the notebook; confirm provenance/date of the file.
data = pd.read_csv('user_profiles_for_ads.csv')
In [3]:
# Display the first 5 records (DataFrame.head() defaults to 5 rows)
data.head()
Out[3]:
User ID Age Gender Location Language Education Level Likes and Reactions Followed Accounts Device Usage Time Spent Online (hrs/weekday) Time Spent Online (hrs/weekend) Click-Through Rates (CTR) Conversion Rates Ad Interaction Time (sec) Income Level Top Interests
0 1 25-34 Female Suburban Hindi Technical 5640 190 Mobile Only 4.5 1.7 0.193 0.067 25 20k-40k Digital Marketing
1 2 65+ Male Urban Hindi PhD 9501 375 Tablet 0.5 7.7 0.114 0.044 68 0-20k Data Science
2 3 45-54 Female Suburban Spanish Technical 4775 187 Mobile Only 4.5 5.6 0.153 0.095 80 60k-80k Fitness and Wellness
3 4 35-44 Female Rural Spanish PhD 9182 152 Desktop Only 3.1 4.2 0.093 0.061 65 100k+ Gaming, DIY Crafts
4 5 25-34 Female Urban English Technical 6848 371 Mobile Only 2.0 3.8 0.175 0.022 99 20k-40k Fitness and Wellness, Investing and Finance, G...

Column Description¶

  • User ID : Unique identifier for each user
  • Age : Age range of the user
  • Gender : Gender of the user
  • Location : User's location type (Urban, Suburban, Rural)
  • Language : Primary language of the user
  • Education Level : Highest education level attained by the user
  • Likes and Reactions : Number of likes and reactions given by the user
  • Followed Accounts : Number of accounts followed by user
  • Device Usage : Primary device of the user (Mobile Only, Desktop Only, Tablet)
  • Time Spent Online (HRS / Weekday) : AVG time spent online on weekdays
  • Time Spent Online (HRS / Weekend) : AVG time spent online on weekends
  • Click Through Rates (CTR) : The percentage of ad impression that leads to clicks
  • Conversion Rates : The percentage of clicks that lead to conversions / actions
  • Ad Interaction Time (Sec) : AVG time spent interacting with ads in seconds
  • Income Level : User's income level
  • Top Interests : Primary interests of the user
In [4]:
# Check for missing values in each column
# (every count is 0, so no imputation or row-dropping is needed downstream)
data.isnull().sum()
Out[4]:
User ID                            0
Age                                0
Gender                             0
Location                           0
Language                           0
Education Level                    0
Likes and Reactions                0
Followed Accounts                  0
Device Usage                       0
Time Spent Online (hrs/weekday)    0
Time Spent Online (hrs/weekend)    0
Click-Through Rates (CTR)          0
Conversion Rates                   0
Ad Interaction Time (sec)          0
Income Level                       0
Top Interests                      0
dtype: int64

Exploratory Data Analysis¶

In [5]:
# Setting the aesthetic style of the plots (white background with gridlines)
sns.set_style("whitegrid")
In [6]:
# Creating subplots for the demographic distributions
# (2x2 grid: age, gender, education level, income level)
fig, axes = plt.subplots(2, 2, figsize = (18, 12))
fig.suptitle('Distribution of Key Demographic Variables')

# Age Distribution
sns.countplot(ax = axes[0, 0], x = 'Age', data = data, palette = 'coolwarm')
axes[0, 0].set_title('Age Distribution')
axes[0, 0].tick_params(axis = 'x', rotation = 45)

# Gender Distribution
sns.countplot(ax = axes[0, 1], x = 'Gender', data = data, palette = 'coolwarm')
axes[0, 1].set_title('Gender Distribution')

# Education Level Distribution
sns.countplot(ax = axes[1, 0], x = 'Education Level', data = data, palette = 'coolwarm')
axes[1, 0].set_title('Education Level Distribution')  # was missing: subplot rendered untitled
axes[1, 0].tick_params(axis = 'x', rotation = 45)

# Income Level Distribution
sns.countplot(ax = axes[1, 1], x = 'Income Level', data = data, palette = 'coolwarm')
axes[1, 1].set_title('Income Level Distribution')  # was missing: subplot rendered untitled
axes[1, 1].tick_params(axis = 'x', rotation = 45)

# rect bottom was 0.3, which blanked out the lower 30% of the figure;
# 0.03 leaves a small margin while 0.95 keeps the suptitle from overlapping the axes
plt.tight_layout(rect = [0, 0.03, 1, 0.95])
plt.show()
In [7]:
# Distribution of users' primary device type
fig, ax = plt.subplots(figsize = (7.5, 3.5))
sns.countplot(x = 'Device Usage', data = data, palette = 'coolwarm', ax = ax)
ax.set_title('Device Usage Distribution')
plt.show()
In [8]:
# Creating subplots for user online behaviour and ad interaction metrics
# (3x2 grid of histograms, each with a KDE overlay)
fig, axes = plt.subplots(3, 2, figsize = (18, 15))
fig.suptitle('User Online Behaviour and Ad Interaction Metrics')

# Time spent online on weekdays
sns.histplot(ax = axes[0, 0], x = 'Time Spent Online (hrs/weekday)', data = data, bins = 20, kde = True, color = 'skyblue')
axes[0, 0].set_title('Time Spent Online on Weekdays')

# Time spent online on weekends
sns.histplot(ax = axes[0, 1], x = 'Time Spent Online (hrs/weekend)', data = data, bins = 20, kde = True, color = 'orange')
axes[0, 1].set_title('Time Spent Online on Weekend')

# Likes and Reactions
sns.histplot(ax = axes[1, 0], x = 'Likes and Reactions', data = data, bins = 20, kde = True, color = 'green')
axes[1, 0].set_title('Likes and Reactions')

# Click-Through Rates
sns.histplot(ax = axes[1, 1], x = 'Click-Through Rates (CTR)', data = data, bins = 20, kde = True, color = 'Red')
axes[1, 1].set_title('Click-Through Rates (CTR)')

# Conversion Rates
sns.histplot(ax = axes[2, 0], x = 'Conversion Rates', data = data, bins = 20, kde = True, color = 'purple')
axes[2, 0].set_title('Conversion Rates')

# Ad Interaction Time
sns.histplot(ax = axes[2, 1], x = 'Ad Interaction Time (sec)', data = data, bins = 20, kde = True, color = 'brown')
axes[2, 1].set_title('Ad Interaction Time (sec)')

# rect bottom was 0.3, which blanked out the lower 30% of the figure;
# 0.03 leaves a small margin while 0.95 keeps the suptitle from overlapping the axes
plt.tight_layout(rect = [0, 0.03, 1, 0.95])
plt.show()
In [9]:
# Identify most common interest among users
In [10]:
# Import Counter
from collections import Counter
In [11]:
# Splitting the 'Top Interests' column and creating a flat list of all interests.
# .explode() flattens the per-row lists in linear time; the previous .sum()
# concatenated Python lists pairwise, which is quadratic in the number of rows.
# Equivalent output here since In[4] showed the column has no missing values.
interests_list = data['Top Interests'].str.split(', ').explode().tolist()
In [12]:
# Counting how often each interest appears across all users
interests_counter = Counter(interests_list)
In [13]:
# Turn the interest counts into a dataframe, ordered by frequency (highest first)
interest_items = list(interests_counter.items())
interests_df = (
    pd.DataFrame(interest_items, columns = ['Interest', 'Frequency'])
      .sort_values(by = 'Frequency', ascending = False)
)
In [14]:
# Plotting the 10 most common interests as a horizontal bar chart
plt.figure(figsize = (7.5, 3.5))
sns.barplot(x = 'Frequency', y = 'Interest', data = interests_df.head(10), palette = 'coolwarm')
plt.title('Top 10 User Interests')
plt.xlabel('Frequency')
plt.ylabel('Interest')
plt.show()

User Profiling and Segmentation¶

Segmentation can be based on criteria such as :

  1. Demographics : Age, Gender, Education Level, Income Level
  2. Behavioural : Time Spent Online, Likes and Reactions, CTR, Conversion Rates
  3. Interests : Aligning Ad Content with Top Interests Identified
In [15]:
# Import required libraries for clustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
In [16]:
# Selecting features for clustering: demographics (Age, Gender, Income Level)
# plus behavioural metrics (time online, likes/reactions, CTR)
features = ['Age', 'Gender', 'Income Level', 'Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
           'Likes and Reactions', 'Click-Through Rates (CTR)']
In [17]:
# Select just the clustering features; `data` itself is left unchanged
X = data[features]
In [18]:
# Numerical features are standardized (zero mean, unit variance) so that
# no single feature dominates the distance computation used by KMeans
numeric_features = ['Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
                   'Likes and Reactions', 'Click-Through Rates (CTR)']
numeric_transformer = StandardScaler()
In [19]:
# Categorical features are one-hot encoded so KMeans can treat them numerically
categorical_features = ['Age', 'Gender', 'Income Level']
categorical_transformer = OneHotEncoder()
In [20]:
# Combining preprocessing steps: scale numeric columns, one-hot encode
# categorical ones; columns not listed are dropped (ColumnTransformer's
# default remainder='drop')
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
In [21]:
# Creating a preprocessing and clustering pipeline.
# n_init is pinned explicitly: sklearn's default was changing from 10 to 'auto'
# (it emitted a FutureWarning for this cell), so pinning keeps the clustering
# reproducible across sklearn versions without changing current behaviour.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('cluster', KMeans(n_clusters=5, n_init=10, random_state=42))])

# Fit on the selected features and attach each user's cluster label to the data
pipeline.fit(X)
cluster_labels = pipeline.named_steps['cluster'].labels_
data['Cluster'] = cluster_labels
C:\Users\Mahesh S Valanju\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
In [22]:
# Preview the first rows with the new 'Cluster' column appended
data.head()
Out[22]:
User ID Age Gender Location Language Education Level Likes and Reactions Followed Accounts Device Usage Time Spent Online (hrs/weekday) Time Spent Online (hrs/weekend) Click-Through Rates (CTR) Conversion Rates Ad Interaction Time (sec) Income Level Top Interests Cluster
0 1 25-34 Female Suburban Hindi Technical 5640 190 Mobile Only 4.5 1.7 0.193 0.067 25 20k-40k Digital Marketing 2
1 2 65+ Male Urban Hindi PhD 9501 375 Tablet 0.5 7.7 0.114 0.044 68 0-20k Data Science 1
2 3 45-54 Female Suburban Spanish Technical 4775 187 Mobile Only 4.5 5.6 0.153 0.095 80 60k-80k Fitness and Wellness 0
3 4 35-44 Female Rural Spanish PhD 9182 152 Desktop Only 3.1 4.2 0.093 0.061 65 100k+ Gaming, DIY Crafts 3
4 5 25-34 Female Urban English Technical 6848 371 Mobile Only 2.0 3.8 0.175 0.022 99 20k-40k Fitness and Wellness, Investing and Finance, G... 2
In [23]:
# Find mean of numerical features and mode of categorical features to find defining characteristics
In [24]:
# Profile each cluster: mean of the numeric features per cluster,
# then the most frequent (mode) value of each categorical feature
cluster_means = data.groupby('Cluster')[numeric_features].mean()

for feature in categorical_features:
    # mode() is sorted, so .iloc[0] picks a deterministic winner on ties
    cluster_means[feature] = data.groupby('Cluster')[feature].agg(lambda s: s.mode().iloc[0])

cluster_means
Out[24]:
Time Spent Online (hrs/weekday) Time Spent Online (hrs/weekend) Likes and Reactions Click-Through Rates (CTR) Age Gender Income Level
Cluster
0 3.911111 5.212963 2409.620370 0.149588 25-34 Female 80k-100k
1 1.559394 6.002424 5005.121212 0.179836 35-44 Male 80k-100k
2 3.019737 2.584211 6861.587719 0.170614 25-34 Male 20k-40k
3 3.080882 5.774510 7457.602941 0.067971 25-34 Female 100k+
4 1.809626 3.839572 3021.219251 0.056594 45-54 Female 0-20k
In [25]:
# NOTE(review): this cell is a bare string, so the notebook merely echoes it
# as Out[25] — a markdown cell would be a better home for this note.
""" Assign each cluster a name that reflects its most defining characteristics based on :
1. the mean values of numerical features and 
2. most frequent categories for categorical features """
Out[25]:
' Assign each cluster a name that reflects its most defining characteristics based on :\n1. the mean values of numerical features and \n2. most frequent categories for categorical features '

Cluster Names¶

Cluster 0 - "Weekend Warriors" : Highest weekday and high weekend online activity, lowest likes and reactions, predominantly female, age group 25-34, income level 80k-100k

Cluster 1 - "Engaged Professionals" : Low weekday but highest weekend online activity, moderate likes and reactions, highest CTR, predominantly male, age group 35-44, income level 80k-100k

Cluster 2 - "Low-Key Users" : Moderate weekday and lowest weekend online activity, high likes and reactions, high CTR, predominantly male, age group 25-34, income level 20k-40k

Cluster 3 - "Active Explorers" : High overall online activity, highest likes and reactions, low CTR, predominantly female, age group 25-34, income level 100k+

Cluster 4 - "Budget Browsers" : Low online activity, low likes and reactions, lowest CTR, predominantly female, age group 45-54, lowest income level (0-20k)

Prepare data for radar chart¶

In [26]:
# Import required libraries
import numpy as np
In [27]:
# Preparing data for radar chart
features_to_plot = ['Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
                   'Likes and Reactions', 'Click-Through Rates (CTR)']
labels = np.array(features_to_plot)

# Creating a dataframe for the radar chart (one row per cluster)
radar_df = cluster_means[features_to_plot].reset_index()

# Min-max normalize each feature to [0, 1] so all radar axes share one scale
radar_df_normalized = radar_df.copy()
for feature in features_to_plot:
    radar_df_normalized[feature] = (radar_df[feature] - radar_df[feature].min()) / (radar_df[feature].max()
                                                                                   - radar_df[feature].min())

# Repeat the first row at the end so plots over the rows close the circle.
# iloc[[0]] keeps it a one-row DataFrame; the previous iloc[0] yielded a Series,
# and concatenating a Series onto a DataFrame misaligns it into a new NaN-filled
# column instead of appending a row.
radar_df_normalized = pd.concat([radar_df_normalized, radar_df_normalized.iloc[[0]]])

# Assigning names to segments (order matches cluster labels 0..4)
segment_names = ['Weekend Warriors', 'Engaged Professionals', 'Low-Key Users', 'Active Explorers', 'Budget Browsers']

Create a visualization reflecting these segments¶

In [28]:
# Import required libraries
import plotly.graph_objects as go
In [29]:
# Create an empty plotly figure; one trace per segment is added in the next cell
fig = go.Figure()
In [30]:
# Add one closed radar trace per user segment
for i, segment in enumerate(segment_names):
    segment_values = radar_df_normalized.iloc[i][features_to_plot].values.tolist()

    # Repeat the first value and label so the polygon closes on itself
    closed_r = segment_values + segment_values[:1]
    closed_theta = labels.tolist() + [labels[0]]

    # Hover text shows 'feature : normalized value' for each vertex,
    # with the first entry repeated for the closing vertex
    hover_text = [f'{label} : {value:.2f}' for label,
                  value in zip(features_to_plot, radar_df_normalized.iloc[i][features_to_plot])]
    hover_text = hover_text + hover_text[:1]

    fig.add_trace(go.Scatterpolar(
        r = closed_r,
        theta = closed_theta,
        fill = 'toself',
        name = segment,
        hoverinfo = 'text',
        text = hover_text
    ))
In [31]:
# Finalize the radar chart: radial axis fixed to [0, 1] to match the
# min-max normalized feature values
radial_axis = dict(visible = True, range = [0, 1])

fig.update_layout(
    polar = dict(radialaxis = radial_axis),
    showlegend = True,
    title = 'User Segments Profile'
)

fig.show()